Installing Required packages¶

In [ ]:
!pip install pymongo lifelines pandas
!pip install maxminddb-geolite2 folium

Importing The Required Packages¶

In [2]:
import pandas as pd
from pymongo import MongoClient
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from geolite2 import geolite2
import folium
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import plotly.graph_objs as go
from scipy import stats
from sklearn.ensemble import IsolationForest
from scipy.stats import ttest_ind, mannwhitneyu

Accessing the data from the mongodb¶

  • Spiliting the Node Data And Message Data
  • And Storing the Data to the lists
In [3]:
#connecting the database
client = MongoClient('mongodb://localhost:27017/')

# select the database and collection
db = client['major']
collection = db['data']

data = collection.find()
nodes=[]
messages=[]
for d in data:
    d.pop('_id')  # remove the '_id' field from each document
    if 'port' in d:
        nodes.append(d)
    else:
        messages.append(d)

Createing the DataFrame With Message¶

  • Extracting all the three columns in payload set
  • Inserting the payload data to the dataframe
In [4]:
df = pd.DataFrame(messages)
df = pd.concat([df.drop(['payload'], axis=1), df['payload'].apply(pd.Series)], axis=1)
df
Out[4]:
sender receiver date temperature wind speed
0 832a1898-0575-4d8d-b961-bae784b6dd0d 68703395-5a6a-4620-aecb-88a47253a781 2022-12-28 10:07:31 24 9 157
1 b1bfb6ed-b2e7-46b3-ac2e-2ee5e14c0799 0ae2f8c1-ac4f-417b-a9ef-80173aea3a79 2022-10-28 02:35:52 23 1 122
2 2ca1f8d3-b7bc-4e71-91b7-98c0ed150783 e70fb74d-5beb-485a-a37e-0658af2a3cbe 2022-12-18 10:38:19 27 9 179
3 8eea24ec-6941-4b3f-8337-7999f5530602 0721080a-0ddf-46c1-a6bf-b5cc6df41861 2023-01-20 02:03:36 29 7 195
4 42b5e43c-dbf0-4696-bfb8-15a4efbba101 028a248c-52c2-4c7c-a387-aaf5f658331b 2022-10-21 15:11:48 23 1 185
... ... ... ... ... ... ...
595 a9aba850-a62a-4434-a628-d2c8c4c3fdd6 8a712d8b-5453-47ba-8d64-abf1d11d1c1b 2023-01-15 22:02:32 27 14 105
596 0c81bc4a-db3b-412f-8514-afd31a96913d d91286d0-b637-472d-947e-f3940fde6844 2023-01-23 04:48:57 27 1 186
597 62d65c20-33ec-426a-b089-9fb4d7d6a051 158130d9-d724-4987-bc7a-b14c26b97252 2023-01-01 22:05:55 26 0 135
598 96096f13-34c6-48d8-a556-30fb2c851220 6754c66b-1fd7-4657-a4ca-f22575921f82 2022-11-21 13:37:12 22 16 194
599 c5f82214-7c9f-49a3-9da9-eb2c42e98e29 d56f3755-83d3-4d6c-a14b-d70dfe424a27 2022-12-27 01:34:06 26 16 198

600 rows × 6 columns

Creating a DataFrame With The Nodes Data¶

In [5]:
df1 = pd.DataFrame(nodes)
df1
Out[5]:
id port addr peers honest
0 b1bfb6ed-b2e7-46b3-ac2e-2ee5e14c0799 29464 181.116.72.251 [131.46.126.190, 137.218.170.21, 166.165.61.15... true
1 84d73484-be89-4dba-9a2d-5906218690e6 36666 28.46.162.194 [105.122.9.123, 51.49.234.131, 38.152.179.69, ... true
2 7096e26b-e4ad-4372-b8c1-ad678af802cf 27253 137.218.170.21 [23.104.239.48, 176.127.36.206, 51.49.234.131,... true
3 0998bce7-f2a6-4209-8f27-372d5dad026c 35977 49.171.135.105 [227.232.138.74, 176.109.174.69, 87.101.29.237... true
4 8ccff98e-b2d0-4eef-9e6f-25b9cf12e5fe 25112 184.193.5.40 [176.127.36.206, 8.178.157.69, 233.167.126.226... true
... ... ... ... ... ...
595 f65090be-293c-461c-89d0-4aeed84cecf1 29832 5.23.123.237 [214.194.1.209, 234.82.244.37, 211.7.121.116, ... true
596 f24d7eef-9f50-4d69-bc27-683c4cc69d15 21501 84.107.249.36 [85.252.182.39, 186.34.67.36, 3.37.19.199, 121... true
597 66a38013-24fd-4806-8205-bd0a5bb96093 33913 189.71.8.217 [180.231.161.196, 242.76.202.173, 5.23.123.237... true
598 bf0b0a77-79e5-4e16-97a5-9bdc4f14fd44 25133 202.15.10.205 [59.56.4.168, 22.133.131.87, 12.85.125.43, 147... true
599 7af3d73c-5318-438f-bae4-89ecfb0a7e0c 31701 196.63.212.40 [32.199.106.14, 124.108.203.173, 68.205.186.33... true

600 rows × 5 columns

Merge Both Message Data And Node Data Based on Sender and Id¶

In [6]:
merged_df = pd.merge(df, df1, left_on='sender', right_on='id')
merged_df=merged_df.drop(['id'],axis=1)
merged_df
Out[6]:
sender receiver date temperature wind speed port addr peers honest
0 832a1898-0575-4d8d-b961-bae784b6dd0d 68703395-5a6a-4620-aecb-88a47253a781 2022-12-28 10:07:31 24 9 157 32692 66.13.28.163 [197.143.62.237, 233.167.126.226, 54.199.60.13... true
1 832a1898-0575-4d8d-b961-bae784b6dd0d ed59c1df-f324-4cf2-9f97-4a75a1a1cfd0 2022-11-17 15:24:22 30 1 120 32692 66.13.28.163 [197.143.62.237, 233.167.126.226, 54.199.60.13... true
2 832a1898-0575-4d8d-b961-bae784b6dd0d 97d71e1b-7861-4d75-82ac-f651c5268ce4 2022-10-11 00:00:51 25 7 183 32692 66.13.28.163 [197.143.62.237, 233.167.126.226, 54.199.60.13... true
3 832a1898-0575-4d8d-b961-bae784b6dd0d 2fd37557-de7d-4607-876f-45703e7020dd 2022-12-24 16:52:39 27 14 188 32692 66.13.28.163 [197.143.62.237, 233.167.126.226, 54.199.60.13... true
4 832a1898-0575-4d8d-b961-bae784b6dd0d a26cab79-a60f-4675-aadb-cdcb3c0c3d3c 2022-12-27 06:25:43 28 14 162 32692 66.13.28.163 [197.143.62.237, 233.167.126.226, 54.199.60.13... true
... ... ... ... ... ... ... ... ... ... ...
595 b770885b-7b27-40f5-96a9-2e489632ee7f 158130d9-d724-4987-bc7a-b14c26b97252 2022-09-13 23:34:52 28 0 161 35785 22.133.131.87 [217.154.191.84, 150.24.66.160, 87.196.152.101... true
596 43e77632-c742-497f-a4c2-293e77d8c813 67835d5b-8330-4001-b360-92a822f8164c 2023-01-26 02:17:14 29 5 139 32690 249.138.143.121 [153.97.177.85, 235.226.18.8, 214.10.134.179, ... true
597 f45b8b9e-e46e-442d-b12b-c8d2827e0df6 510dfdf7-e0a3-4d8c-9e13-a2f849f00f12 2023-01-04 06:59:20 27 0 168 25344 214.10.134.179 [20.46.174.135, 87.196.152.101, 64.229.137.249... true
598 a9aba850-a62a-4434-a628-d2c8c4c3fdd6 8a712d8b-5453-47ba-8d64-abf1d11d1c1b 2023-01-15 22:02:32 27 14 105 20459 28.97.2.20 [7.91.194.236, 201.116.9.178, 225.140.65.179, ... true
599 96096f13-34c6-48d8-a556-30fb2c851220 6754c66b-1fd7-4657-a4ca-f22575921f82 2022-11-21 13:37:12 22 16 194 31264 129.28.19.33 [201.116.9.178, 215.91.254.245, 44.143.239.75,... true

600 rows × 10 columns

Correlation Analysis¶

Correlation refers to the strength of a relationship between two variables, and is typically measured by a correlation coefficient. In the context of a DataFrame, correlation can be used to determine whether there is a linear relationship between two columns of data.

The correlation coefficient, which is a value between -1 and 1, indicates the strength and direction of the relationship between two variables. A correlation coefficient of 1 indicates a perfect positive relationship, where one variable increases as the other variable increases. A correlation coefficient of -1 indicates a perfect negative relationship, where one variable decreases as the other variable increases. A correlation coefficient of 0 indicates no relationship between the two variables.

In [7]:
def correlation(column1,column2):
    correlation = merged_df[column1].corr(merged_df[column2])
    return(f"Correlation between {column1} and {column2}:", correlation)

Correlation Between temperature and wind¶

In [8]:
correlation('temperature','wind')
Out[8]:
('Correlation between temperature and wind:', 0.09709881763028405)

Correlation Between wind and speed¶

In [9]:
correlation('wind','speed')
Out[9]:
('Correlation between wind and speed:', 0.08819521317235023)

Correlation Between temperature and speed¶

In [10]:
correlation('temperature','speed')
Out[10]:
('Correlation between temperature and speed:', 0.050831862956256545)

The Count Of Communication Between Each Sender And Receiver¶

In [11]:
frequency_table = merged_df.groupby(['sender', 'receiver']).size().reset_index(name='count')
print("Frequency table of sender-receiver pairs:")
frequency_table
Frequency table of sender-receiver pairs:
Out[11]:
sender receiver count
0 0385da65-9ab0-46ac-a6d2-462041bf1fe1 08f8f0e5-8f6e-4893-be5d-23276887a98e 1
1 0385da65-9ab0-46ac-a6d2-462041bf1fe1 fd59b68b-1b09-4b0b-b167-ac7d9d11d6b8 1
2 040a8064-3026-4249-82b8-75c8a1aae388 aad31313-444b-4477-9df1-7806b0ae31af 1
3 06d4ce0d-44bc-430d-807f-c79765424e6b 847a2241-ea9c-4755-ae12-6e1e56937fc3 1
4 0721080a-0ddf-46c1-a6bf-b5cc6df41861 42b5e43c-dbf0-4696-bfb8-15a4efbba101 1
... ... ... ...
594 fb2839d3-a266-46e6-a048-823b6706b1f6 46b360be-fc07-47fa-a9dc-fdc3bfab313e 1
595 fb2839d3-a266-46e6-a048-823b6706b1f6 65515120-efbd-42c7-a78c-d106e2d50818 1
596 fde2d368-f04d-4072-b04e-2980b0583999 1f622d8e-f2d2-46a4-a1ad-87eb2cbd2aa0 1
597 fde2d368-f04d-4072-b04e-2980b0583999 e90c46d7-deb8-4061-82c2-634bd21a0f8b 1
598 fee60cd9-ff19-4f84-a156-03d99a91006e 9c6b6086-7846-4693-84ae-0888fda2892a 1

599 rows × 3 columns

Trends in temperature, wind, and speed over time.¶

In [12]:
# Convert the date column to datetime format
merged_df['date'] = pd.to_datetime(merged_df['date'])

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 8))

merged_df.plot(x='date', y='temperature', marker='o', ax=axes[0])
merged_df.plot(x='date', y='wind', marker='o', ax=axes[1])
merged_df.plot(x='date', y='speed', marker='o', ax=axes[2])

# Set the title and labels for the plots
axes[0].set_title('Temperature over Time')
axes[1].set_title('Wind over Time')
axes[2].set_title('Speed over Time')

fig.suptitle('Trends in Temperature, Wind, and Speed over Time')
plt.xlabel('Date')
plt.tight_layout()
plt.show()

Seasonal patterns in temperature and wind¶

In [13]:
# Extract month and year information from the date
merged_df['month'] = merged_df['date'].dt.month
merged_df['year'] = merged_df['date'].dt.year

# Create boxplots for temperature and wind by month
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

sns.boxplot(x='month', y='temperature', data=merged_df, ax=axes[0])
axes[0].set_title('Seasonal Patterns in Temperature')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Temperature')

sns.boxplot(x='month', y='wind', data=merged_df, ax=axes[1])
axes[1].set_title('Seasonal Patterns in Wind')
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Wind')

plt.show()

Most frequently used sender and receiver addresses.¶

In [14]:
# Create a directed graph using NetworkX
G = nx.DiGraph()

# Add edges to the graph for each sender-receiver pair
for sender, receiver in zip(merged_df['sender'], merged_df['receiver']):
    G.add_edge(sender, receiver)

# Calculate the most frequently used sender and receiver addresses
most_frequent_senders = merged_df['sender'].value_counts().head(5)
most_frequent_receivers = merged_df['receiver'].value_counts().head(5)

# Create a subgraph of the most frequently used sender and receiver addresses
subgraph_nodes = set(most_frequent_senders.index) | set(most_frequent_receivers.index)
subgraph = G.subgraph(subgraph_nodes)

# Set node size based on degree centrality
node_sizes = [10000 * subgraph.degree(node) for node in subgraph.nodes()]

# Set node colors based on degree centrality
node_colors = [subgraph.degree(node) for node in subgraph.nodes()]

# Draw the network graph using Matplotlib
pos = nx.spring_layout(subgraph, k=5)
nx.draw_networkx_nodes(subgraph, pos, node_color=node_colors, node_size=node_sizes)
nx.draw_networkx_edges(subgraph, pos)
nx.draw_networkx_labels(subgraph, pos)
plt.axis('off')
plt.show()
In [15]:
# Create a directed graph using NetworkX
df_half = merged_df.sample(frac=0.05)
G = nx.DiGraph()

# Add edges to the graph for each sender-receiver pair
for sender, receiver, date, port in zip(df_half['sender'], df_half['receiver'], df_half['date'], df_half['port']):
    G.add_edge(sender, receiver, date=date, port=port)

# Add edges to the graph for nodes that communicated on the same date
for date in df_half['date'].unique():
    nodes_on_date = set(df_half[df_half['date'] == date]['sender']).union(set(df_half[df_half['date'] == date]['receiver']))
    for node1 in nodes_on_date:
        for node2 in nodes_on_date:
            if node1 != node2 and not G.has_edge(node1, node2):
                G.add_edge(node1, node2, date=date)

# Add edges to the graph for nodes that communicated on the same port
for port in df_half['port'].unique():
    nodes_on_port = set(df_half[df_half['port'] == port]['sender']).union(set(df_half[df_half['port'] == port]['receiver']))
    for node1 in nodes_on_port:
        for node2 in nodes_on_port:
            if node1 != node2 and not G.has_edge(node1, node2):
                G.add_edge(node1, node2, port=port)

# Draw the network graph using Matplotlib
pos = nx.spring_layout(G, k=0.5)
nx.draw_networkx(G, pos, with_labels=True, node_size=500, font_size=8)
plt.axis('off')
plt.show()

Analysis of the port distribution.¶

In [16]:
port_counts = merged_df['port'].value_counts()

# Create a bar chart of the distribution of port numbers using Plotly Express
fig = px.bar(y=port_counts.index, x=port_counts.values, labels={'y': 'Port', 'x': 'Frequency'},
             title='Distribution of Port Numbers',template='plotly_white')

# Show the plot
fig.show()

Predicting temperature, wind, or speed based on date and other features¶

Preprocessing the data: a. Converting the date column to datetime format. b. Extracting relevant features from the date, such as day of the week and month. c. Encoding the categorical feature (sender) using label encoding.

Defining the features (X) and target (y) for the prediction task. In this example, the target variable is 'temperature', and the features include the day of the week, month, wind, speed, and the encoded sender information.

Splitting the dataset into a training set and a testing set. The training set is used to train the machine learning model, while the testing set is used to evaluate its performance.

Training a RandomForestRegressor model on the training set. The RandomForestRegressor is an ensemble learning method for regression tasks that operate by constructing multiple decision trees during training and outputting the mean prediction of the individual trees.

Predicting temperature values for the test set using the trained RandomForestRegressor model.

Calculating the mean absolute error (MAE) of the predictions. The MAE is a measure of the average difference between the true temperature values and the predicted values in the test set. It gives an idea of how well the model performs on unseen data.

In [17]:
# Convert the date column to datetime format and extract relevant features
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df['day_of_week'] = merged_df['date'].dt.dayofweek
merged_df['month'] = merged_df['date'].dt.month

# Encode the categorical feature (sender)
encoder = LabelEncoder()
merged_df['sender_encoded'] = encoder.fit_transform(merged_df['sender'])

# Define the features (X) and target (y)
X = merged_df[['day_of_week', 'month', 'wind', 'speed', 'sender_encoded']]
y = merged_df['temperature']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict temperature using the test set
y_pred = model.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
Mean Absolute Error: 2.7030000000000003

Predicting the honest attribute¶

In [18]:
# Define the features (X) and target (y)
X = merged_df[['day_of_week', 'month', 'temperature', 'wind', 'speed', 'sender_encoded']]
y = merged_df['honest']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the honest attribute using the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.9083333333333333

geographical Address¶

the 'addr' feature represents geographical IP addresses, you can convert them into coordinates (latitude and longitude) and analyze the spatial distribution of data points.

In [19]:
# Initialize the GeoLite2 reader
reader = geolite2.reader()

# Function to get coordinates from IP addresses
def get_coordinates(ip):
    location_data = reader.get(ip)
    if location_data and 'location' in location_data:
        return location_data['location']['latitude'], location_data['location']['longitude']
    return None, None

# Get coordinates for each IP address
merged_df['latitude'], merged_df['longitude'] = zip(*merged_df['addr'].apply(get_coordinates))

# Close the GeoLite2 reader
geolite2.close()

# Create a map centered at the mean latitude and longitude values
map_center = [merged_df['latitude'].mean(), merged_df['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=2)

# Add markers for each IP address
for index, row in merged_df.iterrows():
    if pd.notna(row['latitude']) and pd.notna(row['longitude']):
        folium.Marker([row['latitude'], row['longitude']], tooltip=row['addr']).add_to(m)

# Display the map
m
Out[19]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [20]:
# Create a Scatter Mapbox plot
fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', hover_name='addr',
                        mapbox_style='carto-positron', zoom=2)

# Show the plot
fig.show()

Group similar data points together using unsupervised learning techniques like K-means, DBSCAN, or hierarchical clustering.¶

In [21]:
# Define the number of clusters (you can determine the optimal number using methods like the Elbow method)
n_clusters = 3
merged_df.dropna(inplace=True)
# Create a KMeans model
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit the model using latitude and longitude
coordinates = merged_df[['latitude', 'longitude']]
kmeans.fit(coordinates)

# Get cluster labels
merged_df['cluster'] = kmeans.labels_

# Plot the clustered data points
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', color='cluster',
                        hover_name='addr', mapbox_style='carto-positron', zoom=2,
                        title='K-Means Clustering')
fig.show()
C:\ProgramData\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

C:\ProgramData\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.

In [22]:
scaler = StandardScaler()
coordinates = merged_df[['latitude', 'longitude']]
scaled_coordinates = scaler.fit_transform(coordinates)
dbscan = DBSCAN(eps=1, min_samples=2)
dbscan.fit(scaled_coordinates)
merged_df['dbscan_cluster'] = dbscan.labels_
agg = AgglomerativeClustering(n_clusters=3)
agg.fit(scaled_coordinates)
merged_df['agg_cluster'] = agg.labels_
fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', color='dbscan_cluster',
                        hover_name='addr', mapbox_style='carto-positron', zoom=2,
                        title='DBSCAN Clustering')
fig.show()

fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', color='agg_cluster',
                        hover_name='addr', mapbox_style='carto-positron', zoom=2,
                        title='Agglomerative Clustering')
fig.show()

Analyze the characteristics of each cluster to identify patterns and trends.¶

In [23]:
scaler = StandardScaler()
coordinates = merged_df[['latitude', 'longitude']]
scaled_coordinates = scaler.fit_transform(coordinates)

agg = AgglomerativeClustering(n_clusters=3)
agg.fit(scaled_coordinates)
merged_df['agg_cluster'] = agg.labels_
def plot_box(df, feature, cluster_col):
    fig = px.box(merged_df, x=cluster_col, y=feature, title=f'{feature.capitalize()} Distribution by Cluster')
    fig.show()

plot_box(merged_df, 'temperature', 'agg_cluster')
plot_box(merged_df, 'speed', 'agg_cluster')
cluster_summary = merged_df.groupby('agg_cluster').agg({
    'temperature': ['mean', 'median', 'std'],
    'speed': ['mean', 'median', 'std']
})

print(cluster_summary)
            temperature                        speed                  
                   mean median       std        mean median        std
agg_cluster                                                           
0             24.867521   25.0  3.134258  150.021368  150.0  28.719368
1             24.830935   25.0  3.113412  149.467626  149.0  29.771404
2             25.160000   25.0  3.197395  158.200000  155.0  30.575044

Identify unusual data points or outliers in features like temperature, wind, and speed using techniques like IQR, Z-score¶

In [24]:
def iqr_outliers(merged_df, column):
    Q1 = merged_df[column].quantile(0.25)
    Q3 = merged_df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = merged_df[(merged_df[column] < lower_bound) | (merged_df[column] > upper_bound)]
    return outliers

print("IQR Outliers:")
print("Temperature:", iqr_outliers(merged_df, 'temperature'))
print("Wind:", iqr_outliers(merged_df, 'wind'))
print("Speed:", iqr_outliers(merged_df, 'speed'))
IQR Outliers:
Temperature: Empty DataFrame
Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster]
Index: []
Wind: Empty DataFrame
Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster]
Index: []
Speed: Empty DataFrame
Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster]
Index: []
In [25]:
def z_score_outliers(df, column, threshold=2):
    z_scores = np.abs(stats.zscore(df[column]))
    outliers = df[z_scores > threshold]
    return outliers

print("\nZ-score Outliers:")
print("Temperature:", z_score_outliers(merged_df, 'temperature'))
print("Wind:", z_score_outliers(merged_df, 'wind'))
print("Speed:", z_score_outliers(merged_df, 'speed'))
Z-score Outliers:
Temperature: Empty DataFrame
Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster]
Index: []
Wind: Empty DataFrame
Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster]
Index: []
Speed: Empty DataFrame
Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster]
Index: []

correlation_matrix for all the data¶

In [26]:
correlation_matrix = merged_df.corr()
correlation_matrix
C:\Users\sgk\AppData\Local\Temp\ipykernel_14512\3842175115.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Out[26]:
temperature wind speed port month year day_of_week sender_encoded latitude longitude cluster dbscan_cluster agg_cluster
temperature 1.000000 0.089504 0.040735 -0.013230 -0.040469 0.054897 -0.011577 0.060711 0.019469 -0.016675 0.030297 0.011943 0.006436
wind 0.089504 1.000000 0.091357 0.015203 0.028749 -0.056781 0.028394 -0.043638 0.042026 0.045152 0.042482 -0.027122 -0.064909
speed 0.040735 0.091357 1.000000 0.084844 -0.004437 0.015983 -0.067476 0.065391 -0.039479 0.021823 -0.000241 -0.054323 0.024742
port -0.013230 0.015203 0.084844 1.000000 0.003070 0.001866 0.000703 0.044636 -0.099132 -0.018504 -0.016333 -0.023332 0.078715
month -0.040469 0.028749 -0.004437 0.003070 1.000000 -0.965079 -0.072037 -0.001030 -0.054375 -0.029413 -0.040569 -0.014832 0.064478
year 0.054897 -0.056781 0.015983 0.001866 -0.965079 1.000000 0.061758 0.004424 0.031919 0.029090 0.024055 0.021042 -0.048618
day_of_week -0.011577 0.028394 -0.067476 0.000703 -0.072037 0.061758 1.000000 0.023619 0.022731 -0.024866 -0.012000 -0.064942 -0.000542
sender_encoded 0.060711 -0.043638 0.065391 0.044636 -0.001030 0.004424 0.023619 1.000000 -0.021767 -0.041993 0.027412 -0.070786 0.066477
latitude 0.019469 0.042026 -0.039479 -0.099132 -0.054375 0.031919 0.022731 -0.021767 1.000000 -0.035060 0.241434 0.170454 -0.522622
longitude -0.016675 0.045152 0.021823 -0.018504 -0.029413 0.029090 -0.024866 -0.041993 -0.035060 1.000000 0.651369 -0.082993 -0.753771
cluster 0.030297 0.042482 -0.000241 -0.016333 -0.040569 0.024055 -0.012000 0.027412 0.241434 0.651369 1.000000 -0.015637 -0.766240
dbscan_cluster 0.011943 -0.027122 -0.054323 -0.023332 -0.014832 0.021042 -0.064942 -0.070786 0.170454 -0.082993 -0.015637 1.000000 -0.104322
agg_cluster 0.006436 -0.064909 0.024742 0.078715 0.064478 -0.048618 -0.000542 0.066477 -0.522622 -0.753771 -0.766240 -0.104322 1.000000

Formulate hypotheses related to your dataset, such as "there is a significant difference in wind speed between honest and dishonest data points."¶

In [27]:
honest_df = merged_df[merged_df['honest'] == 'true']
dishonest_df = merged_df[merged_df['honest'] == 'false']

#ttest_ind is a statistical method that is used to compare the means of two independent groups of data.

t_stat, t_pvalue = ttest_ind(honest_df['speed'], dishonest_df['speed'])
print("t-test:")
print("  t-statistic:", t_stat)
print("  p-value:", t_pvalue)
t-test:
  t-statistic: 0.28016801917050665
  p-value: 0.7794568968744121
In [28]:
#mannwhitneyu is a statistical method that is used to compare the distributions of two independent samples of data.

mw_stat, mw_pvalue = mannwhitneyu(honest_df['speed'], dishonest_df['speed'])
print("\nMann-Whitney U test:")
print("  U-statistic:", mw_stat)
print("  p-value:", mw_pvalue)
Mann-Whitney U test:
  U-statistic: 8539.5
  p-value: 0.7962009507861991

Perform statistical tests like t-tests or chi-squared tests to confirm or reject these hypotheses.¶

In [29]:
t_stat, t_pvalue = ttest_ind(honest_df['temperature'], dishonest_df['temperature'])
print("t-test:")
print("  t-statistic:", t_stat)
print("  p-value:", t_pvalue)
t-test:
  t-statistic: -1.5302179746204665
  p-value: 0.12655382391911785
In [30]:
contingency_table = pd.crosstab(merged_df['sender'], merged_df['honest'])
print("Contingency Table:")
contingency_table
Contingency Table:
Out[30]:
honest false true
sender
0385da65-9ab0-46ac-a6d2-462041bf1fe1 0 2
040a8064-3026-4249-82b8-75c8a1aae388 1 0
06d4ce0d-44bc-430d-807f-c79765424e6b 0 1
072d5042-68a7-489d-972b-b9e3074eaf2a 0 2
07432308-5a09-4e9a-9d1a-6314c59d741f 0 3
... ... ...
f90b1084-4a0f-47e6-9567-345700e794d5 0 2
fab75436-bea2-4040-9692-cdc8d0ffbfc7 0 1
fb2839d3-a266-46e6-a048-823b6706b1f6 0 2
fde2d368-f04d-4072-b04e-2980b0583999 0 2
fee60cd9-ff19-4f84-a156-03d99a91006e 0 1

339 rows × 2 columns

chi2_contingency is a statistical method used to test for the independence of two categorical variables in a contingency table.¶

In [31]:
chi2_stat, chi2_pvalue, chi2_dof, chi2_ex = chi2_contingency(contingency_table)
print("\nChi-squared Test:")
print("  chi2-statistic:", chi2_stat)
print("  p-value:", chi2_pvalue)
print("  degrees of freedom:", chi2_dof)
Chi-squared Test:
  chi2-statistic: 537.0
  p-value: 2.9653844335890057e-11
  degrees of freedom: 338
In [32]:
fullfalse=contingency_table[contingency_table['false'] !=0]
fullfalse
Out[32]:
honest false true
sender
040a8064-3026-4249-82b8-75c8a1aae388 1 0
16555781-840a-42dd-9636-b2c849251f1b 1 0
22cd4c98-9162-4798-bd6d-cc013da972a5 1 0
23a6eb58-1ee3-4bc3-ad7e-a9e1d7d0f571 1 0
2becdd30-78f0-433e-bb6e-17a798c721a9 1 0
2cb28562-bc2e-457f-ac18-cf5c0662dd7e 1 0
31305180-08c8-44a7-b15d-c2980bdec662 2 0
467de89c-6638-4458-8fa9-e701afd83120 1 0
4f572263-4be2-436a-8e48-d9aa044de09d 1 0
54e07a5f-8fe0-45eb-90e0-e8762c21fe2c 1 0
66c4766a-1630-4b5a-b859-f09e97b4e616 1 0
6c192683-b510-47f4-b57f-28338454b3ea 1 0
7571ba27-71fa-4368-b4f5-5ce1223f95a5 2 0
7ecb595b-74cd-40e3-a73a-72c204d6439a 2 0
852f464a-b23c-4e09-a2b2-f41ed8997d4d 2 0
9403b508-516f-4ae1-80d2-e707862d0124 1 0
a1926bf3-7aa3-4466-990e-68322aa7a029 1 0
a34dffb9-03af-4b51-ab62-d7ba0ff954a2 1 0
c9ea49da-9bc3-450a-9959-619991d58f4a 2 0
ca8818bb-f56e-4b3b-8c96-9f6760134498 2 0
d9777e82-84c0-4a13-881a-73264ade6b88 2 0
da658947-0a2a-4b29-ad69-5792f0477a91 1 0
dc64a949-18ba-4329-99e9-9d65b6de8f35 1 0
e8a24155-20f3-44e6-ac5b-acdca7361c1c 2 0
e8fd134e-25f0-40ad-b60e-934bb334e7f8 1 0
In [33]:
falsecontent=merged_df[merged_df['honest']=='false']
falsecontent.reset_index(drop=True)
Out[33]:
sender receiver date temperature wind speed port addr peers honest month year day_of_week sender_encoded latitude longitude cluster dbscan_cluster agg_cluster
0 ca8818bb-f56e-4b3b-8c96-9f6760134498 41e0ab50-b719-406e-9ceb-ef02e6446633 2022-12-20 06:12:03 26 4 106 28292 11.253.5.52 [163.179.28.48, 169.104.39.230, 137.218.170.21... false 12 2022 1 306 37.7510 -97.8220 0 0 1
1 ca8818bb-f56e-4b3b-8c96-9f6760134498 ddb3af13-5834-48d3-87b5-eab45b909df2 2022-09-07 18:06:55 24 1 131 28292 11.253.5.52 [163.179.28.48, 169.104.39.230, 137.218.170.21... false 9 2022 2 306 37.7510 -97.8220 0 0 1
2 54e07a5f-8fe0-45eb-90e0-e8762c21fe2c 7dd00fd4-7ca5-4098-80d0-ae6ef8f17d45 2022-11-05 07:39:39 30 16 106 20633 131.227.31.206 [166.143.47.13, 55.195.18.250, 184.193.5.40, 1... false 11 2022 5 137 51.2167 -0.5667 2 0 0
3 31305180-08c8-44a7-b15d-c2980bdec662 2ca1f8d3-b7bc-4e71-91b7-98c0ed150783 2022-12-23 09:55:47 29 2 119 36164 181.111.234.129 [19.98.38.90, 20.189.192.124, 165.185.167.47, ... false 12 2022 4 77 -34.6033 -58.3817 0 0 2
4 31305180-08c8-44a7-b15d-c2980bdec662 0998bce7-f2a6-4209-8f27-372d5dad026c 2022-12-28 09:32:51 28 13 118 36164 181.111.234.129 [19.98.38.90, 20.189.192.124, 165.185.167.47, ... false 12 2022 2 77 -34.6033 -58.3817 0 0 2
5 467de89c-6638-4458-8fa9-e701afd83120 ea7a0491-f376-405c-b783-640ed4d22881 2023-01-07 10:49:48 22 7 195 33442 218.12.116.115 [65.126.189.120, 227.253.255.235, 163.179.28.4... false 1 2023 5 116 39.8897 115.2750 1 0 0
6 c9ea49da-9bc3-450a-9959-619991d58f4a 4cd58a99-ac8e-42ea-bfe6-bcd3081b12d9 2022-09-28 04:21:58 21 2 132 26354 182.213.244.139 [237.115.30.245, 152.16.93.56, 95.133.14.18, 1... false 9 2022 2 305 37.5112 126.9741 1 0 0
7 c9ea49da-9bc3-450a-9959-619991d58f4a c5ce06d0-4d16-4357-ad5d-40cbfa28759e 2022-11-22 04:22:01 21 6 105 26354 182.213.244.139 [237.115.30.245, 152.16.93.56, 95.133.14.18, 1... false 11 2022 1 305 37.5112 126.9741 1 0 0
8 7ecb595b-74cd-40e3-a73a-72c204d6439a ee013164-f367-44aa-9ee4-af0e93a7266c 2023-01-17 01:11:01 26 7 135 29864 67.219.82.92 [241.68.182.221, 225.87.130.255, 36.238.70.103... false 1 2023 1 190 39.6347 -87.2639 0 0 1
9 7ecb595b-74cd-40e3-a73a-72c204d6439a 234324db-3aec-4b5a-b2bc-9bd5eafaacdb 2022-10-10 04:55:49 30 3 200 29864 67.219.82.92 [241.68.182.221, 225.87.130.255, 36.238.70.103... false 10 2022 0 190 39.6347 -87.2639 0 0 1
10 a1926bf3-7aa3-4466-990e-68322aa7a029 db2c61c2-d2d9-4d6b-9a15-4040e65a3eaf 2022-10-22 22:25:14 27 0 179 34788 157.31.252.164 [152.16.93.56, 61.124.213.100, 24.106.152.13, ... false 10 2022 5 241 33.4499 -112.0712 0 0 1
11 da658947-0a2a-4b29-ad69-5792f0477a91 b4c691de-b4d4-4a70-8ea1-60f7d1eeaf85 2022-10-03 17:49:59 25 9 191 30640 149.104.28.112 [212.208.191.97, 132.254.204.81, 201.177.102.2... false 10 2022 0 324 37.7510 -97.8220 0 0 1
12 e8a24155-20f3-44e6-ac5b-acdca7361c1c b58f8c37-5bac-47f8-b653-52914f3ba2dd 2022-11-02 21:03:00 25 8 147 31343 212.208.191.97 [56.48.105.145, 26.231.127.224, 26.245.153.56,... false 11 2022 2 342 48.7649 1.9292 2 0 0
13 e8a24155-20f3-44e6-ac5b-acdca7361c1c c7ae9dd6-a1a0-49f2-a344-3b4eff4569d4 2022-11-07 09:16:10 28 6 189 31343 212.208.191.97 [56.48.105.145, 26.231.127.224, 26.245.153.56,... false 11 2022 0 342 48.7649 1.9292 2 0 0
14 2becdd30-78f0-433e-bb6e-17a798c721a9 a1926bf3-7aa3-4466-990e-68322aa7a029 2023-01-27 11:46:32 21 10 161 28575 43.103.237.170 [65.90.216.7, 23.155.172.158, 211.209.228.35, ... false 1 2023 4 67 35.6850 139.7514 1 0 0
15 4f572263-4be2-436a-8e48-d9aa044de09d 26d1d8b0-39ca-4e9c-9c39-0776bf63b370 2022-12-21 20:15:16 21 2 108 28150 173.217.108.104 [234.116.2.6, 27.56.41.235, 73.194.2.180, 86.7... false 12 2022 2 130 31.6835 -93.1197 0 0 1
16 2cb28562-bc2e-457f-ac18-cf5c0662dd7e 6fdbb56b-a0bd-4afc-aada-3dff045341f7 2022-09-04 13:17:07 29 2 144 35973 178.18.13.198 [119.15.105.40, 73.194.2.180, 153.127.186.137,... false 9 2022 6 70 53.5303 49.3461 2 0 0
17 a34dffb9-03af-4b51-ab62-d7ba0ff954a2 3223c80f-ee5e-433d-a4ec-da0593588f36 2022-09-21 00:56:57 26 14 157 24713 61.225.107.205 [68.246.74.148, 50.151.200.203, 37.238.110.20,... false 9 2022 2 245 23.5000 121.0000 1 0 0
18 7571ba27-71fa-4368-b4f5-5ce1223f95a5 c9ea49da-9bc3-450a-9959-619991d58f4a 2022-12-16 04:53:54 29 8 128 34410 150.209.40.178 [182.213.244.139, 178.18.13.198, 98.170.14.91,... false 12 2022 4 179 43.0361 -75.3817 0 0 1
19 7571ba27-71fa-4368-b4f5-5ce1223f95a5 0ed4ea90-7224-4654-8c53-d7084ab7718d 2022-10-27 04:41:10 22 15 177 34410 150.209.40.178 [182.213.244.139, 178.18.13.198, 98.170.14.91,... false 10 2022 3 179 43.0361 -75.3817 0 0 1
20 16555781-840a-42dd-9636-b2c849251f1b 3f59a5f8-b903-48f2-9691-907b3046ecb1 2022-09-22 09:19:23 28 7 104 24089 76.99.145.77 [33.6.151.178, 71.190.70.177, 93.26.72.207, 14... false 9 2022 3 33 40.2933 -75.7441 0 0 1
21 d9777e82-84c0-4a13-881a-73264ade6b88 7bcdf7b7-35dd-49a1-be6c-4e8aaf4a8811 2023-01-08 07:42:23 28 10 170 36417 23.253.209.155 [35.251.80.187, 6.231.245.255, 220.214.217.101... false 1 2023 6 322 29.4889 -98.3987 0 0 1
22 d9777e82-84c0-4a13-881a-73264ade6b88 218b4f1d-8977-4a31-8ff9-c68b5db7cac0 2022-09-01 01:22:40 24 2 171 36417 23.253.209.155 [35.251.80.187, 6.231.245.255, 220.214.217.101... false 9 2022 3 322 29.4889 -98.3987 0 0 1
23 22cd4c98-9162-4798-bd6d-cc013da972a5 d2d64234-f05d-42f4-bf23-392c538da4e7 2023-01-10 21:06:40 23 12 152 28294 90.90.66.105 [4.130.37.44, 186.32.76.3, 153.119.59.158, 139... false 1 2023 1 49 48.9833 2.2333 2 0 0
24 040a8064-3026-4249-82b8-75c8a1aae388 aad31313-444b-4477-9df1-7806b0ae31af 2022-10-20 04:57:32 28 12 168 31852 90.36.7.56 [5.137.192.15, 67.202.183.135, 37.13.63.149, 2... false 10 2022 3 1 16.2333 -61.3667 0 0 1
25 852f464a-b23c-4e09-a2b2-f41ed8997d4d 6ea944b0-028a-4532-97b8-ce9ce1225fb2 2022-12-05 13:12:25 24 3 181 23050 24.223.71.151 [216.139.145.182, 67.35.12.255, 184.182.38.122... false 12 2022 0 198 41.7101 -103.6648 0 0 1
26 852f464a-b23c-4e09-a2b2-f41ed8997d4d b56d6a9c-7454-47c0-ad98-2e56a0b9d012 2022-12-30 12:49:50 26 10 170 23050 24.223.71.151 [216.139.145.182, 67.35.12.255, 184.182.38.122... false 12 2022 4 198 41.7101 -103.6648 0 0 1
27 dc64a949-18ba-4329-99e9-9d65b6de8f35 aba972fb-cc42-49e8-a5e0-546bf2de79d4 2022-10-05 05:11:54 30 7 199 29942 112.76.206.99 [24.223.71.151, 21.117.23.46, 87.231.93.255, 2... false 10 2022 2 327 37.5112 126.9741 1 0 0
28 e8fd134e-25f0-40ad-b60e-934bb334e7f8 dd9a8706-3f47-4a37-b864-74c84ce1e4c7 2022-09-04 19:42:19 26 12 101 25818 110.75.115.42 [245.37.200.78, 105.159.233.225, 149.21.126.20... false 9 2022 6 343 30.2936 120.1614 1 0 0
29 9403b508-516f-4ae1-80d2-e707862d0124 072d5042-68a7-489d-972b-b9e3074eaf2a 2022-09-29 16:34:30 26 14 156 33940 86.120.23.203 [132.38.19.227, 150.126.208.240, 217.101.36.18... false 9 2022 3 225 45.6333 25.5833 2 0 0
30 6c192683-b510-47f4-b57f-28338454b3ea 86b1dd0b-ddef-4968-88f7-bda1aaeebfb6 2022-10-14 16:22:05 20 1 185 35158 152.190.100.232 [87.91.229.167, 216.37.69.51, 235.226.18.8, 20... false 10 2022 4 163 39.0180 -77.5390 0 0 1
31 23a6eb58-1ee3-4bc3-ad7e-a9e1d7d0f571 c7dc8e68-ccc1-479f-9d6e-076d8d9d6fbe 2023-01-14 02:02:36 24 1 120 39825 176.40.17.118 [147.242.90.109, 180.231.161.196, 22.245.208.1... false 1 2023 5 52 39.9264 32.8888 2 0 0
32 66c4766a-1630-4b5a-b859-f09e97b4e616 66a38013-24fd-4806-8205-bd0a5bb96093 2023-01-11 23:55:31 30 3 103 37407 17.157.180.94 [33.139.142.206, 87.196.152.101, 176.40.17.118... false 1 2023 2 157 37.3042 -122.0946 0 0 1

A Geographical view of the dis-honest nodes¶

In [34]:
# Initialize the GeoLite2 reader
reader = geolite2.reader()
# Close the GeoLite2 reader
geolite2.close()

# Create a map centered at the mean latitude and longitude values
map_center = [falsecontent['latitude'].mean(), falsecontent['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=2)

# Add markers for each IP address
for index, row in falsecontent.iterrows():
    if pd.notna(row['latitude']) and pd.notna(row['longitude']):
        folium.Marker([row['latitude'], row['longitude']], tooltip=row['addr']).add_to(m)

# Display the map
m
Out[34]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: